Loading libraries
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
library(ggplot2)
library(countrycode)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Set MAPBOX_TOKEN up front; this prevents the mapbox token error.
Sys.setenv(MAPBOX_TOKEN = 11122223333444)
# Read owid-covid-data.csv from the working directory into `data`.
data <- read.csv("owid-covid-data.csv")
#head(data)
Data preprocessing: check the column names to determine which ones are
important for achieving the objectives, and check for null/NaN
values.
Write the column names to a text file for my reference in the
analysis
# Collect the column names of `data` and write them, one per line, to
# columns.txt.
columns <- names(data)
file_columns <- file("columns.txt", open = "w")
writeLines(columns, con = file_columns)
close(file_columns) # release the connection once the names are written
Check columns with the null values
#which(is.na(data))
#cols_with_na <- which(apply(data, 2, function(x) any(is.na(x))))
#colnames(data)[cols_with_na] # this indicates that there is at least one missing record in every feature of the dataset, except iso_code, continent, location and date
#colSums(is.na(data))
Visualizing the total reported cases since the start of the
pandemic
# Work on `df` so that the raw `data` object stays untouched.
df <- data
# Base plot of total_cases against date, with rows ordered by total_cases.
# NOTE(review): every row of df feeds a single trace; if df holds several
# locations per date they all share one line -- confirm this is intended.
p <- plot_ly(arrange(df, total_cases), x = ~date, y = ~total_cases)
# Draw the series as a line trace and display it.
p %>% add_lines()
Considering the new tests
# Line chart of new_tests against date.
# add_lines() takes the plot from the pipe as its first argument, so it is
# called with no further positional argument: the second parameter of
# add_lines() is `x`, and a plot object must not be passed there.
p <- df %>%
  arrange(new_tests) %>%
  plot_ly(x = ~date, y = ~new_tests) %>%
  add_lines()
# Print the plot; an assignment alone does not display it.
p
A notable issue in these visualizations is that, although millions of
cases were reported, the number of new tests was relatively low. This
implies that some of the countries with the highest numbers of reported
cases did not report new tests.
To confirm the validity of this conclusion, the following
visualization digs deeper, highlighting the top countries with the most
cases and the most new tests.
# Group the dataset into years and months.
# The dates are parsed once and reused for both the month and the year.
dates <- df$date
parsed_dates <- ymd(dates)
months <- month(parsed_dates)
years <- year(parsed_dates)
# New data frame holding only the columns needed for the summary.
new_df <- data.frame(
  Month = months,
  Year = years,
  Continent = df$continent,
  total_cases = df$total_cases
)
# Ten largest Continent/Year totals, largest first.
# - read.csv() leaves blank text fields as "" rather than NA, so empty
#   continent labels are dropped explicitly as well as NA ones.
# - .groups = "drop" removes the grouping left behind by summarise(), so the
#   top-10 selection ranks all rows together instead of within each
#   continent.
# NOTE(review): total_cases looks like a running total; if it is, summing it
# over rows inflates the result -- confirm whether a per-period column
# should be summed instead.
top10 <- new_df %>%
  filter(!is.na(Continent), Continent != "") %>%
  group_by(Continent, Year) %>%
  summarise(totals = sum(total_cases, na.rm = TRUE), .groups = "drop") %>%
  slice_max(totals, n = 10)
## `summarise()` has grouped output by 'Continent'. You can override using the
## `.groups` argument.
## Selecting by totals
# Bar chart of the `totals` column against `Continent` for the rows of top10.
plot_ly(data = top10, x = ~Continent, y = ~totals, type = "bar")
# Switch to coronavirus.csv, which carries fixed lat/long columns, for
# better map visualization.
new_data <- read.csv(file = "coronavirus.csv")
# Keep a binary copy of the loaded data in corona.RData.
save(list = "new_data", file = "corona.RData")
# Preview the first six rows.
head(new_data, n = 6L)
## date province country lat long type cases uid iso2 iso3
## 1 2020-01-22 Alberta Canada 53.9333 -116.5765 confirmed 0 12401 CA CAN
## 2 2020-01-23 Alberta Canada 53.9333 -116.5765 confirmed 0 12401 CA CAN
## 3 2020-01-24 Alberta Canada 53.9333 -116.5765 confirmed 0 12401 CA CAN
## 4 2020-01-25 Alberta Canada 53.9333 -116.5765 confirmed 0 12401 CA CAN
## 5 2020-01-26 Alberta Canada 53.9333 -116.5765 confirmed 0 12401 CA CAN
## 6 2020-01-27 Alberta Canada 53.9333 -116.5765 confirmed 0 12401 CA CAN
## code3 combined_key population continent_name continent_code
## 1 124 Alberta, Canada 4413146 North America <NA>
## 2 124 Alberta, Canada 4413146 North America <NA>
## 3 124 Alberta, Canada 4413146 North America <NA>
## 4 124 Alberta, Canada 4413146 North America <NA>
## 5 124 Alberta, Canada 4413146 North America <NA>
## 6 124 Alberta, Canada 4413146 North America <NA>
# Leading countries by number of confirmed cases: keep the "confirmed" rows,
# total `cases` per country, sort the totals from largest to smallest, and
# keep the ten largest.
filtered_df <- new_data %>%
  filter(type == "confirmed") %>%
  group_by(country) %>%
  summarise(sum_Cases = sum(cases)) %>%
  arrange(desc(sum_Cases)) %>%
  top_n(n = 10)
## Selecting by sum_Cases
# Build the bar chart layer by layer: one bar per country whose height is
# sum_Cases, filled in steel blue, on the minimal theme.
plt <- ggplot(filtered_df, aes(x = country, y = sum_Cases))
plt <- plt + geom_bar(stat = "identity", fill = "steelblue")
plt <- plt + theme_minimal()
# Convert the ggplot object to an interactive plotly figure and display it.
ggplotly(plt)
# Creating a choropleth: prepare the per-country data first.
dates <- new_data$date
Date <- ymd(dates)
#new_data$code3 <- countrycode(new_data$country, "country.name", "iso3c") #convert the names into 3 letter country codes
# Monthly confirmed-case totals per country.
# - YearMonth is formatted straight from the parsed date column
#   (e.g. "Jan 2020").
# - Only "confirmed" rows are summed, so other record types in `type` are
#   not mixed into the case totals.
# - .groups = "drop" returns an ungrouped result.
cases_summed <- new_data %>%
  mutate(YearMonth = format(ymd(date), "%b %Y")) %>%
  filter(type == "confirmed") %>%
  group_by(country, YearMonth, iso3) %>%
  summarise(cases_sum = sum(cases), .groups = "drop")
## `summarise()` has grouped output by 'country', 'YearMonth'. You can override
## using the `.groups` argument.
# iso3 is used for the country codes.
# cases_summed holds one row per country per YearMonth, so it is collapsed
# to a single total per country first; the map then receives exactly one
# value per location. The yellow-to-red scale is set on the trace itself
# through `color`/`colors`.
cases_summed %>%
  group_by(country, iso3) %>%
  summarise(cases_sum = sum(cases_sum), .groups = "drop") %>%
  plot_ly(
    z = ~cases_sum,
    color = ~cases_sum,
    colors = c("yellow", "red"),
    text = ~country,
    locations = ~iso3,
    locationmode = "ISO-3",
    type = "choropleth"
  ) %>%
  colorbar(title = "Total Covid-reported cases ") %>%
  layout(
    title = "Cases by Country",
    geo = list(showframe = FALSE, showcoastlines = FALSE)
  )